In [28]:
# Author: Stephen Situ
# Logistic Regression models the probability of an event taking place by having the log odds be a linear combination
# of the features. They are fitted into a sigmoid function sigma(x) = (1/(1+exp(-x)) where the output can range from 0 to 1.
# This is useful for predicting binary outcomes by defining thresholds (usually 0-0.499 & 0.500-1).
# We take a sample of breast cancer data that is diagnosed as benign ("B") or malignant ("M") (cancer) and train 
# a logistic regression model. 
# Original dataset: https://www.kaggle.com/datasets/vijayaadithyanvg/breast-cancer-prediction
In [ ]:
# Import Libraries
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
In [4]:
# read CSV
breast_cancer_data = pd.read_csv("data.csv")
In [9]:
# Head
breast_cancer_data.head
Out[9]:
<bound method NDFrame.head of            id diagnosis  Radius_mean  Texture_mean  perimeter_mean  area_mean  \
0      842302         M        17.99         10.38          122.80     1001.0   
1      842517         M        20.57         21.77          132.90     1326.0   
2    84300903         M        19.69         21.25          130.00     1203.0   
3    84348301         M        11.42         20.38           77.58      386.1   
4    84358402         M        20.29         14.34          135.10     1297.0   
..        ...       ...          ...           ...             ...        ...   
564    926424         M        21.56         22.39          142.00     1479.0   
565    926682         M        20.13         28.25          131.20     1261.0   
566    926954         M        16.60         28.08          108.30      858.1   
567    927241         M        20.60         29.33          140.10     1265.0   
568     92751         B         7.76         24.54           47.92      181.0   

     smoothness_mean  compactness_mean  concavity_mean  concave points_mean  \
0            0.11840           0.27760         0.30010              0.14710   
1            0.08474           0.07864         0.08690              0.07017   
2            0.10960           0.15990         0.19740              0.12790   
3            0.14250           0.28390         0.24140              0.10520   
4            0.10030           0.13280         0.19800              0.10430   
..               ...               ...             ...                  ...   
564          0.11100           0.11590         0.24390              0.13890   
565          0.09780           0.10340         0.14400              0.09791   
566          0.08455           0.10230         0.09251              0.05302   
567          0.11780           0.27700         0.35140              0.15200   
568          0.05263           0.04362         0.00000              0.00000   

     ...  radius_worst  texture_worst  perimeter_worst  area_worst  \
0    ...        25.380          17.33           184.60      2019.0   
1    ...        24.990          23.41           158.80      1956.0   
2    ...        23.570          25.53           152.50      1709.0   
3    ...        14.910          26.50            98.87       567.7   
4    ...        22.540          16.67           152.20      1575.0   
..   ...           ...            ...              ...         ...   
564  ...        25.450          26.40           166.10      2027.0   
565  ...        23.690          38.25           155.00      1731.0   
566  ...        18.980          34.12           126.70      1124.0   
567  ...        25.740          39.42           184.60      1821.0   
568  ...         9.456          30.37            59.16       268.6   

     smoothness_worst  compactness_worst  concavity_worst  \
0             0.16220            0.66560           0.7119   
1             0.12380            0.18660           0.2416   
2             0.14440            0.42450           0.4504   
3             0.20980            0.86630           0.6869   
4             0.13740            0.20500           0.4000   
..                ...                ...              ...   
564           0.14100            0.21130           0.4107   
565           0.11660            0.19220           0.3215   
566           0.11390            0.30940           0.3403   
567           0.16500            0.86810           0.9387   
568           0.08996            0.06444           0.0000   

     concave points_worst  symmetry_worst  fractal_dimension_worst  
0                  0.2654          0.4601                  0.11890  
1                  0.1860          0.2750                  0.08902  
2                  0.2430          0.3613                  0.08758  
3                  0.2575          0.6638                  0.17300  
4                  0.1625          0.2364                  0.07678  
..                    ...             ...                      ...  
564                0.2216          0.2060                  0.07115  
565                0.1628          0.2572                  0.06637  
566                0.1418          0.2218                  0.07820  
567                0.2650          0.4087                  0.12400  
568                0.0000          0.2871                  0.07039  

[569 rows x 32 columns]>
In [12]:
# describe
breast_cancer_data.describe
Out[12]:
<bound method NDFrame.describe of            id diagnosis  Radius_mean  Texture_mean  perimeter_mean  area_mean  \
0      842302         M        17.99         10.38          122.80     1001.0   
1      842517         M        20.57         21.77          132.90     1326.0   
2    84300903         M        19.69         21.25          130.00     1203.0   
3    84348301         M        11.42         20.38           77.58      386.1   
4    84358402         M        20.29         14.34          135.10     1297.0   
..        ...       ...          ...           ...             ...        ...   
564    926424         M        21.56         22.39          142.00     1479.0   
565    926682         M        20.13         28.25          131.20     1261.0   
566    926954         M        16.60         28.08          108.30      858.1   
567    927241         M        20.60         29.33          140.10     1265.0   
568     92751         B         7.76         24.54           47.92      181.0   

     smoothness_mean  compactness_mean  concavity_mean  concave points_mean  \
0            0.11840           0.27760         0.30010              0.14710   
1            0.08474           0.07864         0.08690              0.07017   
2            0.10960           0.15990         0.19740              0.12790   
3            0.14250           0.28390         0.24140              0.10520   
4            0.10030           0.13280         0.19800              0.10430   
..               ...               ...             ...                  ...   
564          0.11100           0.11590         0.24390              0.13890   
565          0.09780           0.10340         0.14400              0.09791   
566          0.08455           0.10230         0.09251              0.05302   
567          0.11780           0.27700         0.35140              0.15200   
568          0.05263           0.04362         0.00000              0.00000   

     ...  radius_worst  texture_worst  perimeter_worst  area_worst  \
0    ...        25.380          17.33           184.60      2019.0   
1    ...        24.990          23.41           158.80      1956.0   
2    ...        23.570          25.53           152.50      1709.0   
3    ...        14.910          26.50            98.87       567.7   
4    ...        22.540          16.67           152.20      1575.0   
..   ...           ...            ...              ...         ...   
564  ...        25.450          26.40           166.10      2027.0   
565  ...        23.690          38.25           155.00      1731.0   
566  ...        18.980          34.12           126.70      1124.0   
567  ...        25.740          39.42           184.60      1821.0   
568  ...         9.456          30.37            59.16       268.6   

     smoothness_worst  compactness_worst  concavity_worst  \
0             0.16220            0.66560           0.7119   
1             0.12380            0.18660           0.2416   
2             0.14440            0.42450           0.4504   
3             0.20980            0.86630           0.6869   
4             0.13740            0.20500           0.4000   
..                ...                ...              ...   
564           0.14100            0.21130           0.4107   
565           0.11660            0.19220           0.3215   
566           0.11390            0.30940           0.3403   
567           0.16500            0.86810           0.9387   
568           0.08996            0.06444           0.0000   

     concave points_worst  symmetry_worst  fractal_dimension_worst  
0                  0.2654          0.4601                  0.11890  
1                  0.1860          0.2750                  0.08902  
2                  0.2430          0.3613                  0.08758  
3                  0.2575          0.6638                  0.17300  
4                  0.1625          0.2364                  0.07678  
..                    ...             ...                      ...  
564                0.2216          0.2060                  0.07115  
565                0.1628          0.2572                  0.06637  
566                0.1418          0.2218                  0.07820  
567                0.2650          0.4087                  0.12400  
568                0.0000          0.2871                  0.07039  

[569 rows x 32 columns]>
In [13]:
# dtypes
breast_cancer_data.dtypes
Out[13]:
id                           int64
diagnosis                   object
Radius_mean                float64
Texture_mean               float64
perimeter_mean             float64
area_mean                  float64
smoothness_mean            float64
compactness_mean           float64
concavity_mean             float64
concave points_mean        float64
symmetry_mean              float64
fractal_dimension_mean     float64
radius_se                  float64
texture_se                 float64
perimeter_se               float64
area_se                    float64
smoothness_se              float64
compactness_se             float64
concavity_se               float64
concave points_se          float64
symmetry_se                float64
fractal_dimension_se       float64
radius_worst               float64
texture_worst              float64
perimeter_worst            float64
area_worst                 float64
smoothness_worst           float64
compactness_worst          float64
concavity_worst            float64
concave points_worst       float64
symmetry_worst             float64
fractal_dimension_worst    float64
dtype: object
In [23]:
# Change id to category
breast_cancer_data["id"] = breast_cancer_data["id"].astype("category")
In [101]:
# Quick Scatterplot Visualization
ax = sns.scatterplot(x='Radius_mean', y='perimeter_mean', data=breast_cancer_data, hue='diagnosis')
ax.set(xlabel='Mean Radius', ylabel='Mean Perimeter', title='Scatterplot of Breast Cancer Diagnosis')
Out[101]:
[Text(0.5, 0, 'Mean Radius'),
 Text(0, 0.5, 'Mean Perimeter'),
 Text(0.5, 1.0, 'Scatterplot of Breast Cancer Diagnosis')]
In [84]:
# Hot encode categorical variable
bcd = pd.get_dummies(breast_cancer_data.drop(['id'],axis=1))
bcd1 = bcd.drop(['diagnosis_B'],axis=1)
bcd1
Out[84]:
Radius_mean Texture_mean perimeter_mean area_mean smoothness_mean compactness_mean concavity_mean concave points_mean symmetry_mean fractal_dimension_mean ... texture_worst perimeter_worst area_worst smoothness_worst compactness_worst concavity_worst concave points_worst symmetry_worst fractal_dimension_worst diagnosis_M
0 17.99 10.38 122.80 1001.0 0.11840 0.27760 0.30010 0.14710 0.2419 0.07871 ... 17.33 184.60 2019.0 0.16220 0.66560 0.7119 0.2654 0.4601 0.11890 1
1 20.57 21.77 132.90 1326.0 0.08474 0.07864 0.08690 0.07017 0.1812 0.05667 ... 23.41 158.80 1956.0 0.12380 0.18660 0.2416 0.1860 0.2750 0.08902 1
2 19.69 21.25 130.00 1203.0 0.10960 0.15990 0.19740 0.12790 0.2069 0.05999 ... 25.53 152.50 1709.0 0.14440 0.42450 0.4504 0.2430 0.3613 0.08758 1
3 11.42 20.38 77.58 386.1 0.14250 0.28390 0.24140 0.10520 0.2597 0.09744 ... 26.50 98.87 567.7 0.20980 0.86630 0.6869 0.2575 0.6638 0.17300 1
4 20.29 14.34 135.10 1297.0 0.10030 0.13280 0.19800 0.10430 0.1809 0.05883 ... 16.67 152.20 1575.0 0.13740 0.20500 0.4000 0.1625 0.2364 0.07678 1
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
564 21.56 22.39 142.00 1479.0 0.11100 0.11590 0.24390 0.13890 0.1726 0.05623 ... 26.40 166.10 2027.0 0.14100 0.21130 0.4107 0.2216 0.2060 0.07115 1
565 20.13 28.25 131.20 1261.0 0.09780 0.10340 0.14400 0.09791 0.1752 0.05533 ... 38.25 155.00 1731.0 0.11660 0.19220 0.3215 0.1628 0.2572 0.06637 1
566 16.60 28.08 108.30 858.1 0.08455 0.10230 0.09251 0.05302 0.1590 0.05648 ... 34.12 126.70 1124.0 0.11390 0.30940 0.3403 0.1418 0.2218 0.07820 1
567 20.60 29.33 140.10 1265.0 0.11780 0.27700 0.35140 0.15200 0.2397 0.07016 ... 39.42 184.60 1821.0 0.16500 0.86810 0.9387 0.2650 0.4087 0.12400 1
568 7.76 24.54 47.92 181.0 0.05263 0.04362 0.00000 0.00000 0.1587 0.05884 ... 30.37 59.16 268.6 0.08996 0.06444 0.0000 0.0000 0.2871 0.07039 0

569 rows × 31 columns

In [85]:
# Do train test split using 80/20 split and x & y
train_data, test_data = train_test_split(bcd1, test_size=0.2)
train_data_y = train_data['diagnosis_M']
train_data_x = train_data.drop(columns=['diagnosis_M'])
test_data_y = test_data['diagnosis_M']
test_data_x = test_data.drop(columns=['diagnosis_M'])
In [86]:
# Preform Logistic Regression
log_reg = LogisticRegression(max_iter=3000)
log_reg.fit(train_data_x,train_data_y)
Out[86]:
LogisticRegression(max_iter=3000)
In [87]:
# Predict on test data 
y_pred = log_reg.predict(test_data_x)
In [88]:
# Confusion Matrix gives us 95.6% accuracy
confusion_matrix(y_pred,test_data_y)
Out[88]:
array([[68,  3],
       [ 2, 41]], dtype=int64)
In [90]:
# Create new dataframe to visualize accuracy
bcd2 = test_data
bcd2["diagnosis_pred"] = y_pred
bcd2.loc[bcd2['diagnosis_M'] == bcd2['diagnosis_pred'], 'Accuracy'] = 'Correct'
bcd2.loc[bcd2['diagnosis_M'] != bcd2['diagnosis_pred'], 'Accuracy'] = 'Incorrect'
bcd2.loc[bcd2['diagnosis_M'] == 1, 'Diagnosis_true'] = 'M'
bcd2.loc[bcd2['diagnosis_M'] != 1, 'Diagnosis_true'] = 'B'
In [91]:
# Cast columns as categorical
bcd2["Accuracy"] = bcd2["Accuracy"].astype("category")
bcd2["Diagnosis_true"] = bcd2["Diagnosis_true"].astype("category")
In [94]:
# Quick view
bcd2
Out[94]:
Radius_mean Texture_mean perimeter_mean area_mean smoothness_mean compactness_mean concavity_mean concave points_mean symmetry_mean fractal_dimension_mean ... smoothness_worst compactness_worst concavity_worst concave points_worst symmetry_worst fractal_dimension_worst diagnosis_M diagnosis_pred Accuracy Diagnosis_true
166 10.80 9.71 68.77 357.6 0.09594 0.05736 0.025310 0.016980 0.1381 0.06400 ... 0.14360 0.12570 0.10470 0.04603 0.2090 0.07699 0 0 Correct B
482 13.47 14.06 87.32 546.3 0.10710 0.11550 0.057860 0.052660 0.1779 0.06639 ... 0.13930 0.24990 0.18480 0.13350 0.3227 0.09326 0 0 Correct B
212 28.11 18.47 188.50 2499.0 0.11420 0.15160 0.320100 0.159500 0.1648 0.05525 ... 0.11420 0.15160 0.32010 0.15950 0.1648 0.05525 1 1 Correct M
562 15.22 30.62 103.40 716.9 0.10480 0.20870 0.255000 0.094290 0.2128 0.07152 ... 0.14170 0.79170 1.17000 0.23560 0.4089 0.14090 1 1 Correct M
510 11.74 14.69 76.31 426.0 0.08099 0.09661 0.067260 0.026390 0.1499 0.06758 ... 0.10730 0.27930 0.26900 0.10560 0.2604 0.09879 0 0 Correct B
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
157 16.84 19.46 108.40 880.2 0.07445 0.07223 0.051500 0.027710 0.1844 0.05268 ... 0.08774 0.17100 0.18820 0.08436 0.2527 0.05972 0 1 Incorrect B
296 10.91 12.35 69.14 363.7 0.08518 0.04721 0.012360 0.013690 0.1449 0.06031 ... 0.09312 0.07506 0.02884 0.03194 0.2143 0.06643 0 0 Correct B
396 13.51 18.89 88.10 558.1 0.10590 0.11470 0.085800 0.053810 0.1806 0.06079 ... 0.14280 0.25700 0.34380 0.14530 0.2666 0.07686 0 0 Correct B
334 12.30 19.02 77.88 464.4 0.08313 0.04202 0.007756 0.008535 0.1539 0.05945 ... 0.12220 0.09052 0.03619 0.03983 0.2554 0.07207 0 0 Correct B
495 14.87 20.21 96.12 680.9 0.09587 0.08345 0.068240 0.049510 0.1487 0.05748 ... 0.12160 0.13880 0.17000 0.10170 0.2369 0.06599 0 0 Correct B

114 rows × 34 columns

In [107]:
# Create scatter plot to visualize result
gx = sns.scatterplot(x='Radius_mean', y='perimeter_mean', data=bcd2, hue='Accuracy',style="Diagnosis_true")
gx.set(xlabel='Mean Radius', ylabel='Mean Perimeter', title='Scatterplot of Breast Cancer Logistic Regression Prediction On Test Data')
Out[107]:
[Text(0.5, 0, 'Mean Radius'),
 Text(0, 0.5, 'Mean Perimeter'),
 Text(0.5, 1.0, 'Scatterplot of Breast Cancer Logistic Regression Prediction On Test Data')]
In [106]:
# Other Parameters
print('Intercept is', log_reg.intercept_)
print('Coefficients are', log_reg.coef_)
Intercept is [-23.06487791]
Coefficients are [[-0.89510767 -0.19950075  0.19813797 -0.01709282  0.15074503  0.18661899
   0.48322892  0.24736118  0.25028976  0.02861489  0.05131253 -0.94746801
   0.1976007   0.08442126  0.02079837 -0.03942007  0.06354866  0.03564237
   0.03529912 -0.01156539 -0.48316096  0.415974    0.12568589  0.01491001
   0.29112951  0.69990637  1.48231777  0.50418023  0.67023358  0.09829928]]